# Dependency setup --------------------------------------------------------
# install.packages("tidyverse");
# install.packages("rgdal");
library(tidyverse)   # dplyr, ggplot2, readr, ...
library(maps)        # library(), not require(): fail loudly if missing
library(geosphere)   # distVincentyEllipsoid() for geodesic distances
library(stringr)
library(rgdal)       # NOTE(review): rgdal is retired; consider migrating to sf
library(caret)
library(lubridate)
library(maptools)    # NOTE(review): maptools is retired; consider sf/sp
# Install ggmap on first run, then load it once (the original loaded it twice).
if (!requireNamespace("ggmap", quietly = TRUE)) install.packages("ggmap")
library(ggmap)
# Load the raw SPD 911 incident data and compute, for every incident, the
# ellipsoidal (Vincenty) distance in meters from the point of interest
# (here_long, here_lat), which must be defined before this section runs
# (TODO confirm — not visible in this file).
path.to.csv <- '~/Downloads/Seattle_Police_Department_911_Incident_Response (1).csv'
spd.911 <- read.csv(path.to.csv, header = TRUE)
# distVincentyEllipsoid() is vectorized over a matrix of points, so the
# much slower per-row rowwise()/mutate() pattern is unnecessary.
spd.911 <- spd.911 %>%
  mutate(dist = distVincentyEllipsoid(cbind(Longitude, Latitude),
                                      c(here_long, here_lat)))
nrow(spd.911)
# [1] 257011
# Event descriptions that plausibly threaten pedestrians.
descriptions <- c(
  "STRONG ARM ROBBERY",
  "PERSON WITH A WEAPON (NOT GUN)",
  "HAZARDS",
  "HARASSMENT, THREATS",
  "FIGHT DISTURBANCE",
  "CRISIS COMPLAINT - GENERAL",
  "ARMED ROBBERY"
)
# Removes Specifically Harassment by Telephone and Writing, as well as other
# non-scary crimes, after keeping anything matching the list above.
scary.pattern <- paste(descriptions, collapse = "|")
data.ped <- spd.911 %>%
  filter(str_detect(Event.Clearance.Description, scary.pattern)) %>%
  filter(!str_detect(Event.Clearance.Description,
                     "HARASSMENT, THREATS - BY TELEPHONE, WRITING")) %>%
  filter(!str_detect(Event.Clearance.Description,
                     "HARBOR DEBRIS, NAVIGATIONAL HAZARDS"))
nrow(data.ped)
# [1] 15606
# Restrict to incidents within 2.6 km of the point of interest that also have
# a recorded at-scene time.
data.here <- filter(data.ped, dist < 2600)
data.w.at.scene <- data.here %>% filter(!is.na(at_scene_time_date))
data <- data.w.at.scene
nrow(data)
# [1] 722
# View(data)
# Persist the cleaned subset so later sections can start from this file.
write.csv(data, '2016-2017-Clean.csv')
Time is a huge factor when discussing pedestrian safety in an area, or so we’re told. Common wisdom states that night time is more dangerous than day time, but is this even true? When do crimes get reported, and how does that change where the centers of crime are located? Here we look at a year’s worth of SPD data in order to gain some insight.
# Reload the cleaned subset written above so this section can run standalone.
data <- read.csv('2016-2017-Clean.csv', header = TRUE)
# View(data)
# NOTE(review): this pattern ("HARBOR - DEBRIS, ...") differs from the earlier
# filter ("HARBOR DEBRIS, ..." without the dash) — confirm which spelling the
# data actually uses; at most one of the two filters can match.
data <- filter(data, !str_detect(Event.Clearance.Description, "HARBOR - DEBRIS, NAVIGATIONAL HAZARDS"))
nrow(data)
# Plot every remaining incident on the Seattle basemap. "seattle" is a ggmap
# object that is presumably created elsewhere (e.g. via get_map) — TODO confirm.
ggmap(seattle) +
geom_point(data = data, aes(x = Longitude, y = Latitude), colour = "red", alpha = 0.75)
First off, we want to make sure we use as much data as possible. Using reports for all years tracked would be ideal, but that data set is too large to handle easily. Instead, we’d like to use just the past year’s worth, from November 1st, 2016, all the way to October 31st, 2017. That gives us a full year’s worth of data to look at, and it’s far enough in the past from today that we can ensure most, if not all, incidents will be closed (and therefore included in the data set).
Before we go any further though, it is important we determine whether or not the time of year has any meaningful effect on the number of observations we have to work with. If several months have much higher crime rates than others, it may skew results of any analysis we do. With that in mind, let’s take a look at the distribution of crimes for each month in the last year:
# Print the Kruskal-Wallis rank sum test of monthly crime counts (Freq by
# Var1). "k" is presumably computed by kruskal.test() in a chunk not shown
# here — TODO confirm.
k
# Kruskal-Wallis rank sum test
# data: Freq by Var1
# Kruskal-Wallis chi-squared = 11, df = 11, p-value = 0.4433
As we can see, there isn’t much variance in the frequency of reported crimes in our area for the past year; the Kruskal-Wallis test finds no significant difference between months (p = 0.44).
# Frequency of each remaining event description. droplevels() discards the
# factor levels that were filtered out above so they do not show as zero bars.
freq_by_desc <- table(droplevels(data$Event.Clearance.Description))
# View(freq_by_desc)
freq.df <- as.data.frame(freq_by_desc)
ggplot(freq.df, aes(x = Var1, y = Freq)) +
  geom_bar(stat = 'identity') + # create bar plot
  coord_flip()                  # horizontal bars keep long labels readable
# Traffic-related calls, suspicious circumstances, and disturbances are the
# most significant threats to pedestrians.
# One small map per event description, colored by description. The oversized
# points (size = 10) and strip text (size = 50) are tuned for a large figure.
ggmap(seattle) +
  geom_point(
    data = data,
    aes(x = Longitude, y = Latitude,
        group = Event.Clearance.Description,
        color = Event.Clearance.Description),
    alpha = 0.5,
    size = 10
  ) +
  facet_wrap(~ Event.Clearance.Description) +
  theme(
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    strip.text = element_text(size = 50),
    legend.position = "none" # the facet strips already label each panel
  )